{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
    "from sklearn import linear_model\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "import scipy\n",
    "from sklearn.metrics import log_loss\n",
    "import xgboost as xgb\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.metrics import roc_auc_score\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>qid1</th>\n",
       "      <th>qid2</th>\n",
       "      <th>question1</th>\n",
       "      <th>question2</th>\n",
       "      <th>is_duplicate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>What is the step by step guide to invest in sh...</td>\n",
       "      <td>What is the step by step guide to invest in sh...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>What is the story of Kohinoor (Koh-i-Noor) Dia...</td>\n",
       "      <td>What would happen if the Indian government sto...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>How can I increase the speed of my internet co...</td>\n",
       "      <td>How can Internet speed be increased by hacking...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "      <td>Why am I mentally very lonely? How can I solve...</td>\n",
       "      <td>Find the remainder when [math]23^{24}[/math] i...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>Which one dissolve in water quikly sugar, salt...</td>\n",
       "      <td>Which fish would survive in salt water?</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id  qid1  qid2                                          question1  \\\n",
       "0   0     1     2  What is the step by step guide to invest in sh...   \n",
       "1   1     3     4  What is the story of Kohinoor (Koh-i-Noor) Dia...   \n",
       "2   2     5     6  How can I increase the speed of my internet co...   \n",
       "3   3     7     8  Why am I mentally very lonely? How can I solve...   \n",
       "4   4     9    10  Which one dissolve in water quikly sugar, salt...   \n",
       "\n",
       "                                           question2  is_duplicate  \n",
       "0  What is the step by step guide to invest in sh...             0  \n",
       "1  What would happen if the Indian government sto...             0  \n",
       "2  How can Internet speed be increased by hacking...             0  \n",
       "3  Find the remainder when [math]23^{24}[/math] i...             0  \n",
       "4            Which fish would survive in salt water?             0  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('quora_train.csv')\n",
    "df = df.dropna(how=\"any\").reset_index(drop=True)\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x1ea602c84e0>"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY0AAAEHCAYAAABSjBpvAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAEjNJREFUeJzt3X+s3XV9x/Hny1acDhWUSliLK9EuE11EbYDofjBZoLBlxQ0y2CIdNqszkGiim2iygD9IMIuasSkLhEoxTmSoo3HV2iHOGRF6kQpU1N4gSi2BahFxRB343h/nc/VwOb3303srp3ifj+Sb8z3vz4/v5yRtX/n+OKepKiRJ6vGUcS9AkvTkYWhIkroZGpKkboaGJKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSeq2eNwL2N8OO+ywWr58+biXIUlPKrfccsv3qmrJbP1+5UJj+fLlTExMjHsZkvSkkuTbPf28PCVJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqduv3Jf7niyWn/+f417Cr5S7L/7jcS9BWhBmPdNIcmSSG5LcmWR7kje2+oVJvptkW9tOHRrztiSTSb6R5OSh+qpWm0xy/lD9qCQ3JdmR5GNJDmr1p7X3k619+f788JKkfdNzeeoR4M1V9SLgeODcJEe3tvdX1TFt2wTQ2s4EXgysAj6YZFGSRcAHgFOAo4GzhuZ5T5trBfAAsLbV1wIPVNULgfe3fpKkMZk1NKrq3qr6Stt/CLgTWDrDkNXA1VX1k6r6FjAJHNu2yaq6q6p+ClwNrE4S4NXAtW38BuC0obk2tP1rgRNbf0nSGOzTjfB2eehlwE2tdF6S25KsT3Joqy0F7hkatrPV9lZ/LvCDqnpkWv0xc7X2B1t/SdIYdIdGkoOBjwNvqqofApcCLwCOAe4F3jvVdcTwmkN9prmmr21dkokkE7t3757xc0iS5q4rNJI8lUFgfKSqPgFQVfdV1aNV9TPgcgaXn2BwpnDk0PBlwK4Z6t8DDkmyeFr9MXO19mcDe6avr6ouq6qVVbVyyZJZfw5ekjRHPU9PBbgCuLOq3jdUP2Ko22uAO9r+RuDM9uTTUcAK4GZgK7CiPSl1EIOb5RurqoAbgNPb+DXAdUNzrWn7pwOfa/0lSWPQ8z2NVwGvBW5Psq3V3s7g6adjGFwuuht4PUBVbU9yDfA1Bk9enVtVjwIkOQ/YDCwC1lfV9jbfW4Grk7wbuJVBSNFeP5xkksEZxpnz+KySpHmaNTSq6ouMvrewaYYxFwEXjahvGjWuqu7iF5e3hus/Bs6YbY2SpCeGPyMiSepmaEiSuhkakqRuhoYkqZuhIUnqZmhIkroZGpKkboaGJKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqZuhIUnqZmhIkroZGpKkboaGJKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqZuhIUnqZmhIkroZGpKkboaGJKnbrKGR5MgkNyS5M8n2JG9s9eck2ZJkR3s9tNWT5JIkk0luS/LyobnWtP47kqwZqr8iye1tzCVJMtMxJEnj0XOm8Qjw5qp6EXA8cG6So4HzgeuragVwfXsPcAqwom3rgEthEADABcBxwLHABUMhcGnrOzVuVavv7RiSpDGYNTSq6t6q+krbfwi4E1gKrAY2tG4bgNPa/mrgqhr4MnBIkiOAk4EtVbWnqh4AtgCrWtuzqurGqirgqmlzjTqGJGkM9umeRpLlwMuAm4DDq+peGAQL8LzWbSlwz9Cwna02U33niDozHGP6utYlmUgysXv37n35SJKkfdAdGkkOBj4OvKmqfjhT1xG1mkO9W1VdVlUrq2rlkiVL9mWoJGkfdIVGkqcyCIyPVNUnWvm+dmmJ9np/q+8EjhwavgzYNUt92Yj6TMeQJI1Bz9NTAa4A7qyq9w01bQSmnoBaA1w3VD+7PUV1PPBgu7S0GTgpyaHtBvhJwObW9lCS49uxzp4216hjSJLGYHFHn1cBrwVuT7Kt1d4OXAxck2Qt8B3gjNa2CTgVmAQeBs4BqKo9Sd4FbG393llVe9r+G4ArgacDn24bMxxDkjQGs4ZGVX2R0fcdAE4c0b+Ac/cy13pg/Yj6BPCSEfXvjzqGJGk8/Ea4JKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqZuhIUnqZmhIkroZGpKkboaGJKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqZuhIUnqZmhIkroZGpKkboaGJKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSus0aGknWJ7k/yR1DtQuTfDfJtradOtT2tiSTSb6R5OSh+qpWm0xy/lD9qCQ3JdmR5GNJDmr1p7X3k619+f760JKkuek507gSWDWi/v6qOqZtmwCSHA2cCby4jflgkkVJFgEfAE4BjgbOan0B3tPmWgE8AKxt9bXAA1X1QuD9rZ8kaYxmDY2q+gKwp3O+1cDVVfWTqvoWMAkc27bJqrqrqn4KXA2sThLg1cC1bfwG4LShuTa0/WuBE1t/SdKYzOeexnlJbmuXrw5ttaXAPUN9drba3urPBX5QVY9Mqz9mrtb+YOsvSRqTxXMcdynwLqDa63uB1wGjzgSK0eFUM/RnlrbHSLIOWAfw/Oc/f6Z1S5rNhc8e9wp+tVz44LhXsF/N6Uyjqu6rqker6mfA5QwuP8HgTOHIoa7LgF0z1L8HHJJk8bT6Y+Zq7c9mL5fJquqyqlpZVSuXLFkyl48kSeowp9BIcsTQ29cAU09WbQTObE8+HQWsAG4GtgIr2pNSBzG4Wb6xqgq4ATi9jV8DXDc015q2fzrwudZfkjQms16eSvJR4ATgsCQ7gQuAE5Icw+By0d3A6wGqanuSa4CvAY8A51bVo22e84DNwCJgfVVtb4d4K3B1kncDtwJXtPoVwIeTTDI4wzhz3p9WkjQvs4ZGVZ01onzFiNpU/4uAi0bUNwGbRtTv4heXt4brPwbOmG19kqQnjt8IlyR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUbdbQSLI+yf1J7hiqPSfJliQ72uuhrZ4klySZTHJbkpcPjVnT+u9Ismao/ookt7cxlyTJTMeQJI1Pz5nGlcCqabXzgeuragVwfXsPcAqwom3rgEthEADABcBxwLHABUMhcGnrOzVu1SzHkCSNyayhUVVfAPZMK68GNrT9DcBpQ/WrauDLwCFJjgBOBrZU1Z6qegDYAqxqbc+qqhurqoCrps016hiSpDGZ6z2Nw6vqXoD2+rxWXwrcM9RvZ6vNVN85oj7TMR4nybokE0kmdu/ePcePJEmazf6+EZ4RtZpDfZ9U1WVVtbKqVi5ZsmRfh0uSOs01NO5rl5Zor/e3+k7gyKF+y4Bds9SXjajPdAxJ0pjMNTQ2AlNPQK0Brhuqn92eojoeeLBdWtoMnJTk0HYD/CRgc2t7KMnx7amps6fNNeoYkqQxWTxbhyQfBU4ADkuyk8FTUBcD1yRZC3wHOKN13wScCkwCDwPnAFTVniTvAra2fu+sqqmb629g8ITW04FPt40ZjiFJGpNZQ6OqztpL04kj+hZw7l7mWQ+sH1GfAF4yov79UceQJI2P3wiXJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktRtXqGR5O4ktyfZlmSi1Z6TZEuSHe310FZPkkuSTCa5LcnLh+ZZ0/rvSLJmqP6KNv9kG5v5rFeSND/740zjD6vqmKpa2d6fD1xfVSuA69t7gFOAFW1bB1wKg5ABLgCOA44FLpgKmtZn3dC4VfthvZKkOfplXJ5aDWxo+xuA04bqV9XAl4FDkhwBnAxsqao9VfUAsAVY1dqeVVU3VlUBVw3NJUkag/mGRgGfTXJLknWtdnhV3QvQXp/X6kuBe4bG7my1meo7R9QfJ8m6JBNJJnbv3j3PjyRJ2pvF8xz/qqraleR5wJYkX5+h76j7ETWH+uOLVZcBlwGsXLlyZB9J0vzN60yjqna11/uBTzK4J3Ffu7REe72/dd8JHDk0fBmwa5b6shF1SdKYzDk0kvx6kmdO7QMnAXcAG4GpJ6DWANe1/Y3A2e0pquOBB9vlq83ASUkObTfATwI2t7aHkhzfnpo6e2guSdIYzOfy1OHAJ9tTsIuBf6uqzyTZClyTZC3wHeCM1n8TcCowCTwMnANQVXuSvAvY2vq9s6r2tP03AFcCTwc+3TZJ0pjMOTSq6i7gpSPq3wdOHFEv4Ny9zLUeWD+iPgG8ZK5rlCTtX34jXJLUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0O+NBIsirJN5JMJjl/3OuRpIXsgA6NJIuADwCnAEcDZyU5eryrkqSF64AODeBYYLKq7qqqnwJXA6vHvCZJWrAO9NBYCtwz9H5nq0mSxmDxuBcwi4yo1eM6JeuAde3tj5J845e6qoXlMOB7417EbPKeca9AY/Ck+LPJO0b9M3ZA+s2eTgd6aOwEjhx6vwzYNb1TVV0GXPZELWohSTJRVSvHvQ5pOv9sjseBfnlqK7AiyVFJDgLOBDaOeU2StGAd0GcaVfVIkvOAzcAiYH1VbR/zsiRpwTqgQwOgqjYBm8a9jgXMy346UPlncwxS9bj7ypIkjXSg39OQJB1ADA1JUrcD/p6GnjhJfpvBN+6XMvg+zC5gY1XdOdaFSTpgeKYhAJK8lcHPtAS4mcHjzgE+6g9FSprijXABkOSbwIur6v+m1Q8CtlfVivGsTJpZknOq6kPjXsdC4ZmGpvwM+I0R9SNam3Sgese4F7CQeE9DU94EXJ9kB7/4kcjnAy8EzhvbqiQgyW17awIOfyLXstB5eUo/l+QpDH6OfimDv4w7ga1V9ehYF6YFL8l9wMnAA9ObgC9V1aizZP0SeKahn6uqnwFfHvc6pBE+BRxcVdumNyT5/BO/nIXLMw1JUjdvhEuSuhkakqRuhoYWrCRfmuf4v07yL/MYf3eSw+azliSnJTl6rmuQ9pWhoQWrql457jVMmcdaTgMMDT1hDA0tWEl+1F6PSPKFJNuS3JHk92YYc06Sbyb5b+BVQ/Urk5w+Yu4T2tyfTPK1JP/aHm0euZa2//dJbk/y1SQXt9rfJNnaah9P8owkrwT+FPjHtvYXtO0zSW5J8j/t98Sk/cZHbiX4S2BzVV2UZBHwjFGdkhzB4NvHrwAeBG4Abu2Y/1gGZwPfBj4D/Blw7V6OcQqDs4fjqurhJM9pTZ+oqstbn3cDa6vqn5NsBD5VVde2tuuBv62qHUmOAz4IvLpjjVIXQ0Ma/Djj+iRPBf5j1HcBmuOAz1fVboAkHwN+q2P+m6vqrjbmo8DvspfQAP4I+FBVPQxQVXta/SUtLA4BDmbwXyA/RpKDgVcC/55kqvy0jvVJ3bw8pQWvqr4A/D7wXeDDSc6eqfte6o/Q/j5l8C/2QTOMmenLUdlL+5XAeVX1OwzOdn5tRJ+nAD+oqmOGthfNcCxpnxkaWvCS/CZwf7v8cwXw8r10vQk4Iclz21nJGUNtdzO4bAWD/5PkqUNtxyY5qt3L+AvgizMs57PA65I8o61t6vLUM4F723H/aqj/Q62Nqvoh8K0kZ7SxSfLSGY4l7TNDQ4ITgG1JbgX+HPinUZ2q6l7gQuBG4L+Arww1Xw78QZKbGVzG+t+hthuBi4E7gG8Bn9zbQqrqM8BGYCLJNuAtrekfGITWFuDrQ0OuBv4uya1JXsAgUNYm+SqwnUGASfuNPyMi/RIlOQF4S1X9ybjXIu0PnmlIkrp5piGNkOQmHv/k0Wur6vZxrEc6UBgakqRuXp6SJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1+38qOCjM258g7gAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x1ea4641b470>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "df.groupby(\"is_duplicate\")['id'].count().plot.bar()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.drop(['id', 'qid1', 'qid2'], axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "What is the step by step guide to invest in share market in india?\n",
      "What is the step by step guide to invest in share market?\n",
      "\n",
      "What is the story of Kohinoor (Koh-i-Noor) Diamond?\n",
      "What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?\n",
      "\n",
      "How can I increase the speed of my internet connection while using a VPN?\n",
      "How can Internet speed be increased by hacking through DNS?\n",
      "\n",
      "Why am I mentally very lonely? How can I solve it?\n",
      "Find the remainder when [math]23^{24}[/math] is divided by 24,23?\n",
      "\n",
      "Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?\n",
      "Which fish would survive in salt water?\n",
      "\n",
      "Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?\n",
      "I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?\n",
      "\n",
      "Should I buy tiago?\n",
      "What keeps childern active and far from phone and video games?\n",
      "\n",
      "How can I be a good geologist?\n",
      "What should I do to be a great geologist?\n",
      "\n",
      "When do you use シ instead of し?\n",
      "When do you use \"&\" instead of \"and\"?\n",
      "\n",
      "Motorola (company): Can I hack my Charter Motorolla DCX3400?\n",
      "How do I hack Motorola DCX3400 for free internet?\n",
      "\n"
     ]
    }
   ],
   "source": [
    "a = 0 \n",
    "for i in range(a,a+10):\n",
    "    print(df.question1[i])\n",
    "    print(df.question2[i])\n",
    "    print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "SPECIAL_TOKENS = {\n",
    "    'quoted': 'quoted_item',\n",
    "    'non-ascii': 'non_ascii_word',\n",
    "    'undefined': 'something'\n",
    "}\n",
    "\n",
    "def clean(text, stem_words=True):\n",
    "    import re\n",
    "    from string import punctuation\n",
    "    from nltk.stem import SnowballStemmer\n",
    "    from nltk.corpus import stopwords\n",
    "    \n",
    "    def pad_str(s):\n",
    "        return ' '+s+' '\n",
    "    \n",
    "    if pd.isnull(text):\n",
    "        return ''\n",
    "\n",
    "#    stops = set(stopwords.words(\"english\"))\n",
    "    # Clean the text, with the option to stem words.\n",
    "    \n",
    "    # Empty question\n",
    "    \n",
    "    if type(text) != str or text=='':\n",
    "        return ''\n",
    "\n",
    "    # Clean the text\n",
    "    text = re.sub(\"\\'s\", \" \", text) # we have cases like \"Sam is\" or \"Sam's\" (i.e. his) these two cases aren't separable, I choose to compromise are kill \"'s\" directly\n",
    "    text = re.sub(\" whats \", \" what is \", text, flags=re.IGNORECASE)\n",
    "    text = re.sub(\"\\'ve\", \" have \", text)\n",
    "    text = re.sub(\"can't\", \"can not\", text)\n",
    "    text = re.sub(\"n't\", \" not \", text)\n",
    "    text = re.sub(\"i'm\", \"i am\", text, flags=re.IGNORECASE)\n",
    "    text = re.sub(\"\\'re\", \" are \", text)\n",
    "    text = re.sub(\"\\'d\", \" would \", text)\n",
    "    text = re.sub(\"\\'ll\", \" will \", text)\n",
    "    text = re.sub(\"e\\.g\\.\", \" eg \", text, flags=re.IGNORECASE)\n",
    "    text = re.sub(\"b\\.g\\.\", \" bg \", text, flags=re.IGNORECASE)\n",
    "    text = re.sub(\"(\\d+)(kK)\", \" \\g<1>000 \", text)\n",
    "    text = re.sub(\"e-mail\", \" email \", text, flags=re.IGNORECASE)\n",
    "    text = re.sub(\"(the[\\s]+|The[\\s]+)?U\\.S\\.A\\.\", \" America \", text, flags=re.IGNORECASE)\n",
    "    text = re.sub(\"(the[\\s]+|The[\\s]+)?United State(s)?\", \" America \", text, flags=re.IGNORECASE)\n",
    "    text = re.sub(\"\\(s\\)\", \" \", text, flags=re.IGNORECASE)\n",
    "    text = re.sub(\"[c-fC-F]\\:\\/\", \" disk \", text)\n",
    "    \n",
    "    # remove comma between numbers, i.e. 15,000 -> 15000\n",
    "    \n",
    "    text = re.sub('(?<=[0-9])\\,(?=[0-9])', \"\", text)\n",
    "    \n",
    "#     # all numbers should separate from words, this is too aggressive\n",
    "    \n",
    "#     def pad_number(pattern):\n",
    "#         matched_string = pattern.group(0)\n",
    "#         return pad_str(matched_string)\n",
    "#     text = re.sub('[0-9]+', pad_number, text)\n",
    "    \n",
    "    # add padding to punctuations and special chars, we still need them later\n",
    "    \n",
    "    text = re.sub('\\$', \" dollar \", text)\n",
    "    text = re.sub('\\%', \" percent \", text)\n",
    "    text = re.sub('\\&', \" and \", text)\n",
    "    \n",
    "#    def pad_pattern(pattern):\n",
    "#        matched_string = pattern.group(0)\n",
    "#       return pad_str(matched_string)\n",
    "#    text = re.sub('[\\!\\?\\@\\^\\+\\*\\/\\,\\~\\|\\`\\=\\:\\;\\.\\#\\\\\\]', pad_pattern, text) \n",
    "        \n",
    "    text = re.sub('[^\\x00-\\x7F]+', pad_str(SPECIAL_TOKENS['non-ascii']), text) # replace non-ascii word with special word\n",
    "    \n",
    "    # indian dollar\n",
    "    \n",
    "    text = re.sub(\"(?<=[0-9])rs \", \" rs \", text, flags=re.IGNORECASE)\n",
    "    text = re.sub(\" rs(?=[0-9])\", \" rs \", text, flags=re.IGNORECASE)\n",
    "    \n",
    "    # clean text rules get from : https://www.kaggle.com/currie32/the-importance-of-cleaning-text\n",
    "    text = re.sub(r\" (the[\\s]+|The[\\s]+)?US(A)? \", \" America \", text)\n",
    "    text = re.sub(r\" UK \", \" England \", text, flags=re.IGNORECASE)\n",
    "    text = re.sub(r\" india \", \" India \", text)\n",
    "    text = re.sub(r\" switzerland \", \" Switzerland \", text)\n",
    "    text = re.sub(r\" china \", \" China \", text)\n",
    "    text = re.sub(r\" chinese \", \" Chinese \", text) \n",
    "    text = re.sub(r\" imrovement \", \" improvement \", text, flags=re.IGNORECASE)\n",
    "    text = re.sub(r\" intially \", \" initially \", text, flags=re.IGNORECASE)\n",
    "    text = re.sub(r\" quora \", \" Quora \", text, flags=re.IGNORECASE)\n",
    "    text = re.sub(r\" dms \", \" direct messages \", text, flags=re.IGNORECASE)  \n",
    "    text = re.sub(r\" demonitization \", \" demonetization \", text, flags=re.IGNORECASE) \n",
    "    text = re.sub(r\" actived \", \" active \", text, flags=re.IGNORECASE)\n",
    "    text = re.sub(r\" kms \", \" kilometers \", text, flags=re.IGNORECASE)\n",
    "    text = re.sub(r\" cs \", \" computer science \", text, flags=re.IGNORECASE) \n",
    "    text = re.sub(r\" upvote\", \" up vote\", text, flags=re.IGNORECASE)\n",
    "    text = re.sub(r\" iPhone \", \" phone \", text, flags=re.IGNORECASE)\n",
    "    text = re.sub(r\" \\0rs \", \" rs \", text, flags=re.IGNORECASE)\n",
    "    text = re.sub(r\" calender \", \" calendar \", text, flags=re.IGNORECASE)\n",
    "    text = re.sub(r\" ios \", \" operating system \", text, flags=re.IGNORECASE)\n",
    "    text = re.sub(r\" gps \", \" GPS \", text, flags=re.IGNORECASE)\n",
    "    text = re.sub(r\" gst \", \" GST \", text, flags=re.IGNORECASE)\n",
    "    text = re.sub(r\" programing \", \" programming \", text, flags=re.IGNORECASE)\n",
    "    text = re.sub(r\" bestfriend \", \" best friend \", text, flags=re.IGNORECASE)\n",
    "    text = re.sub(r\" dna \", \" DNA \", text, flags=re.IGNORECASE)\n",
    "    text = re.sub(r\" III \", \" 3 \", text)\n",
    "    text = re.sub(r\" banglore \", \" Banglore \", text, flags=re.IGNORECASE)\n",
    "    text = re.sub(r\" J K \", \" JK \", text, flags=re.IGNORECASE)\n",
    "    text = re.sub(r\" J\\.K\\. \", \" JK \", text, flags=re.IGNORECASE)\n",
    "    \n",
    "    # replace the float numbers with a random number, it will be parsed as number afterward, and also been replaced with word \"number\"\n",
    "    \n",
    "    text = re.sub('[0-9]+\\.[0-9]+', \" 87 \", text)\n",
    "  \n",
    "    \n",
    "    # Remove punctuation from text\n",
    "    text = ''.join([c for c in text if c not in punctuation]).lower()\n",
    "       # Return a list of words\n",
    "    return text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['question1'] = df['question1'].apply(clean)\n",
    "df['question2'] = df['question2'].apply(clean)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "what is the step by step guide to invest in share market in india\n",
      "what is the step by step guide to invest in share market\n",
      "\n",
      "what is the story of kohinoor kohinoor diamond\n",
      "what would happen if the indian government stole the kohinoor kohinoor diamond back\n",
      "\n",
      "how can i increase the speed of my internet connection while using a vpn\n",
      "how can internet speed be increased by hacking through dns\n",
      "\n",
      "why am i mentally very lonely how can i solve it\n",
      "find the remainder when math2324math is divided by 2423\n",
      "\n",
      "which one dissolve in water quikly sugar salt methane and carbon di oxide\n",
      "which fish would survive in salt water\n",
      "\n",
      "astrology i am a capricorn sun cap moon and cap risingwhat does that say about me\n",
      "i am a triple capricorn sun moon and ascendant in capricorn what does this say about me\n",
      "\n",
      "should i buy tiago\n",
      "what keeps childern active and far from phone and video games\n",
      "\n",
      "how can i be a good geologist\n",
      "what should i do to be a great geologist\n",
      "\n",
      "when do you use  nonasciiword  instead of  nonasciiword \n",
      "when do you use  and  instead of and\n",
      "\n",
      "motorola company can i hack my charter motorolla dcx3400\n",
      "how do i hack motorola dcx3400 for free internet\n",
      "\n"
     ]
    }
   ],
   "source": [
    "a = 0 \n",
    "for i in range(a,a+10):\n",
    "    print(df.question1[i])\n",
    "    print(df.question2[i])\n",
    "    print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### BOW + Xgboost Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "count_vect = CountVectorizer(analyzer='word', token_pattern=r'\\w{1,}')\n",
    "count_vect.fit(pd.concat((df['question1'],df['question2'])).unique())\n",
    "trainq1_trans = count_vect.transform(df['question1'].values)\n",
    "trainq2_trans = count_vect.transform(df['question2'].values)\n",
    "labels = df['is_duplicate'].values\n",
    "X = scipy.sparse.hstack((trainq1_trans,trainq2_trans))\n",
    "y = labels\n",
    "X_train,X_valid,y_train,y_valid = train_test_split(X,y, test_size = 0.33, random_state = 42)\n",
    "xgb_model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0, reg_alpha=4, objective='binary:logistic', eta=0.3, silent=1, subsample=0.8).fit(X_train, y_train) \n",
    "xgb_prediction = xgb_model.predict(X_valid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training score: 0.6177597850983563\n",
      "validation score: 0.6154032008574583\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.70      0.95      0.80     84267\n",
      "           1       0.77      0.30      0.43     49148\n",
      "\n",
      "   micro avg       0.71      0.71      0.71    133415\n",
      "   macro avg       0.73      0.62      0.62    133415\n",
      "weighted avg       0.72      0.71      0.66    133415\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import f1_score, classification_report, accuracy_score\n",
    "\n",
    "print('training score:', f1_score(y_train, xgb_model.predict(X_train), average='macro'))\n",
    "print('validation score:', f1_score(y_valid, xgb_model.predict(X_valid), average='macro'))\n",
    "print(classification_report(y_valid, xgb_prediction))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Word level TF-IDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\\w{1,}', max_features=5000)\n",
    "tfidf_vect.fit(pd.concat((df['question1'],df['question2'])).unique())\n",
    "trainq1_trans = tfidf_vect.transform(df['question1'].values)\n",
    "trainq2_trans = tfidf_vect.transform(df['question2'].values)\n",
    "labels = df['is_duplicate'].values\n",
    "X = scipy.sparse.hstack((trainq1_trans,trainq2_trans))\n",
    "y = labels\n",
    "X_train,X_valid,y_train,y_valid = train_test_split(X,y, test_size = 0.33, random_state = 42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "word level tf-idf training score: 0.8493408114951853\n",
      "word level tf-idf validation score: 0.7576508867065961\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.79      0.90      0.84     84267\n",
      "           1       0.77      0.60      0.67     49148\n",
      "\n",
      "   micro avg       0.79      0.79      0.79    133415\n",
      "   macro avg       0.78      0.75      0.76    133415\n",
      "weighted avg       0.79      0.79      0.78    133415\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import f1_score, classification_report, accuracy_score\n",
    "xgb_model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0, reg_alpha=4, objective='binary:logistic', eta=0.3, silent=1, subsample=0.8).fit(X_train, y_train) \n",
    "xgb_prediction = xgb_model.predict(X_valid)\n",
    "print('word level tf-idf training score:', f1_score(y_train, xgb_model.predict(X_train), average='macro'))\n",
    "print('word level tf-idf validation score:', f1_score(y_valid, xgb_model.predict(X_valid), average='macro'))\n",
    "print(classification_report(y_valid, xgb_prediction))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### N-gram Level TF-IDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\\w{1,}', ngram_range=(2,3), max_features=5000)\n",
    "tfidf_vect_ngram.fit(pd.concat((df['question1'],df['question2'])).unique())\n",
    "trainq1_trans = tfidf_vect_ngram.transform(df['question1'].values)\n",
    "trainq2_trans = tfidf_vect_ngram.transform(df['question2'].values)\n",
    "labels = df['is_duplicate'].values\n",
    "X = scipy.sparse.hstack((trainq1_trans,trainq2_trans))\n",
    "y = labels\n",
    "X_train,X_valid,y_train,y_valid = train_test_split(X,y, test_size = 0.33, random_state = 42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "n-gram level tf-idf training score: 0.7193864239031045\n",
      "n-gram level tf-idf validation score: 0.67470696099733\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.73      0.92      0.81     84267\n",
      "           1       0.75      0.42      0.54     49148\n",
      "\n",
      "   micro avg       0.73      0.73      0.73    133415\n",
      "   macro avg       0.74      0.67      0.67    133415\n",
      "weighted avg       0.74      0.73      0.71    133415\n",
      "\n"
     ]
    }
   ],
   "source": [
    "xgb_model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0, reg_alpha=4, objective='binary:logistic', eta=0.3, silent=1, subsample=0.8).fit(X_train, y_train) \n",
    "xgb_prediction = xgb_model.predict(X_valid)\n",
    "print('n-gram level tf-idf training score:', f1_score(y_train, xgb_model.predict(X_train), average='macro'))\n",
    "print('n-gram level tf-idf validation score:', f1_score(y_valid, xgb_model.predict(X_valid), average='macro'))\n",
    "print(classification_report(y_valid, xgb_prediction))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Character Level TF-IDF "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "character level tf-idf training score: 0.9844717869102682\n",
      "character level tf-idf validation score: 0.8008380950798113\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.83      0.91      0.87     84267\n",
      "           1       0.81      0.67      0.74     49148\n",
      "\n",
      "   micro avg       0.82      0.82      0.82    133415\n",
      "   macro avg       0.82      0.79      0.80    133415\n",
      "weighted avg       0.82      0.82      0.82    133415\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import f1_score, classification_report, accuracy_score\n",
    "tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\\w{1,}', ngram_range=(2,3), max_features=5000)\n",
    "tfidf_vect_ngram_chars.fit(pd.concat((df['question1'],df['question2'])).unique())\n",
    "trainq1_trans = tfidf_vect_ngram_chars.transform(df['question1'].values)\n",
    "trainq2_trans = tfidf_vect_ngram_chars.transform(df['question2'].values)\n",
    "labels = df['is_duplicate'].values\n",
    "X = scipy.sparse.hstack((trainq1_trans,trainq2_trans))\n",
    "y = labels\n",
    "X_train,X_valid,y_train,y_valid = train_test_split(X,y, test_size = 0.33, random_state = 42)\n",
    "xgb_model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0, reg_alpha=4, objective='binary:logistic', eta=0.3, silent=1, subsample=0.8).fit(X_train, y_train) \n",
    "xgb_prediction = xgb_model.predict(X_valid)\n",
    "print('character level tf-idf training score:', f1_score(y_train, xgb_model.predict(X_train), average='macro'))\n",
    "print('character level tf-idf validation score:', f1_score(y_valid, xgb_model.predict(X_valid), average='macro'))\n",
    "print(classification_report(y_valid, xgb_prediction))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
